{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pandas import Series, DataFrame\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1 pandas数据结构介绍\n",
    "\n",
    "  1.1 序列（Series）\n",
    "    - 索引\n",
    "    - 分片\n",
    "    - 字典操作\n",
    "    - 索引自动对齐运算\n",
    "    - name 属性\n",
    "  1.2 数据框（DataFrame）\n",
    "    - columns\n",
    "    - index\n",
    "    - 索引和切片\n",
    "    - 弹出（pop）\n",
    "    - .T 可以转置数据框\n",
    "  1.3 Panel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2 索引对象 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = Series(range(3), index=['a', 'b', 'c'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['a', 'b', 'c'], dtype='object')"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['b'], dtype='object')"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s.index[1:2]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "索引对象不允许被修改 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "ss = Series(range(3, 6), index=s.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ss.index is s.index"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3 核心的基本函数\n",
    "\n",
    "- .head()\n",
    "- .tail()\n",
    "- .shape\n",
    "- .values\n",
    "- .reindex() 改变索引，创建一个新的对象"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4 索引和旋转\n",
    "\n",
    "object.ix[] 语法支持整数和标签混合使用进行索引，虽然语法很强大，但这种索引方法常常让使用者感到困惑。我们更喜欢更加严格的[]、iloc、loc语法进行索引。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "dates = pd.date_range('2017-5-15', periods=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.random.randn(8, 4), index=dates, columns=['a', 'b', 'c', 'd'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2017-05-15</td>\n",
       "      <td>0.422582</td>\n",
       "      <td>2.192212</td>\n",
       "      <td>-0.195834</td>\n",
       "      <td>1.043583</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-16</td>\n",
       "      <td>0.058919</td>\n",
       "      <td>0.415707</td>\n",
       "      <td>0.387199</td>\n",
       "      <td>0.157099</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-17</td>\n",
       "      <td>0.389779</td>\n",
       "      <td>-0.725836</td>\n",
       "      <td>-1.421320</td>\n",
       "      <td>0.872132</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-18</td>\n",
       "      <td>1.899611</td>\n",
       "      <td>1.075165</td>\n",
       "      <td>0.363440</td>\n",
       "      <td>0.349861</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-19</td>\n",
       "      <td>-1.620429</td>\n",
       "      <td>1.064473</td>\n",
       "      <td>-0.930995</td>\n",
       "      <td>-1.439978</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-20</td>\n",
       "      <td>0.874462</td>\n",
       "      <td>0.673706</td>\n",
       "      <td>-0.119880</td>\n",
       "      <td>1.425492</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-21</td>\n",
       "      <td>-1.403869</td>\n",
       "      <td>-0.182207</td>\n",
       "      <td>3.523592</td>\n",
       "      <td>-0.152446</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-22</td>\n",
       "      <td>-1.388238</td>\n",
       "      <td>0.086740</td>\n",
       "      <td>-1.035970</td>\n",
       "      <td>-0.028374</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   a         b         c         d\n",
       "2017-05-15  0.422582  2.192212 -0.195834  1.043583\n",
       "2017-05-16  0.058919  0.415707  0.387199  0.157099\n",
       "2017-05-17  0.389779 -0.725836 -1.421320  0.872132\n",
       "2017-05-18  1.899611  1.075165  0.363440  0.349861\n",
       "2017-05-19 -1.620429  1.064473 -0.930995 -1.439978\n",
       "2017-05-20  0.874462  0.673706 -0.119880  1.425492\n",
       "2017-05-21 -1.403869 -0.182207  3.523592 -0.152446\n",
       "2017-05-22 -1.388238  0.086740 -1.035970 -0.028374"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "s = df['a']  # []索引"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2017-05-15    0.422582\n",
       "2017-05-16    0.058919\n",
       "2017-05-17    0.389779\n",
       "2017-05-18    1.899611\n",
       "2017-05-19   -1.620429\n",
       "2017-05-20    0.874462\n",
       "2017-05-21   -1.403869\n",
       "2017-05-22   -1.388238\n",
       "Freq: D, Name: a, dtype: float64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2017-05-15</td>\n",
       "      <td>0.422582</td>\n",
       "      <td>2.192212</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-16</td>\n",
       "      <td>0.058919</td>\n",
       "      <td>0.415707</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-17</td>\n",
       "      <td>0.389779</td>\n",
       "      <td>-0.725836</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-18</td>\n",
       "      <td>1.899611</td>\n",
       "      <td>1.075165</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-19</td>\n",
       "      <td>-1.620429</td>\n",
       "      <td>1.064473</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-20</td>\n",
       "      <td>0.874462</td>\n",
       "      <td>0.673706</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-21</td>\n",
       "      <td>-1.403869</td>\n",
       "      <td>-0.182207</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-22</td>\n",
       "      <td>-1.388238</td>\n",
       "      <td>0.086740</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   a         b\n",
       "2017-05-15  0.422582  2.192212\n",
       "2017-05-16  0.058919  0.415707\n",
       "2017-05-17  0.389779 -0.725836\n",
       "2017-05-18  1.899611  1.075165\n",
       "2017-05-19 -1.620429  1.064473\n",
       "2017-05-20  0.874462  0.673706\n",
       "2017-05-21 -1.403869 -0.182207\n",
       "2017-05-22 -1.388238  0.086740"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[['a', 'b']]  #索引多列"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "loc 语法是严格基于标签的，其接收的标签对象包括：\n",
    "  - 一个标签\n",
    "  - 一列标签， ['a', 'b']\n",
    "  - 标签的一个分片 'a':'f'，标签的开始和结束位置都会包括在内。\n",
    "  - 单参的可调用对象"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2017-05-15</td>\n",
       "      <td>0.422582</td>\n",
       "      <td>2.192212</td>\n",
       "      <td>-0.195834</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-16</td>\n",
       "      <td>0.058919</td>\n",
       "      <td>0.415707</td>\n",
       "      <td>0.387199</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-17</td>\n",
       "      <td>0.389779</td>\n",
       "      <td>-0.725836</td>\n",
       "      <td>-1.421320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-18</td>\n",
       "      <td>1.899611</td>\n",
       "      <td>1.075165</td>\n",
       "      <td>0.363440</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-19</td>\n",
       "      <td>-1.620429</td>\n",
       "      <td>1.064473</td>\n",
       "      <td>-0.930995</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-20</td>\n",
       "      <td>0.874462</td>\n",
       "      <td>0.673706</td>\n",
       "      <td>-0.119880</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-21</td>\n",
       "      <td>-1.403869</td>\n",
       "      <td>-0.182207</td>\n",
       "      <td>3.523592</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-22</td>\n",
       "      <td>-1.388238</td>\n",
       "      <td>0.086740</td>\n",
       "      <td>-1.035970</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   a         b         c\n",
       "2017-05-15  0.422582  2.192212 -0.195834\n",
       "2017-05-16  0.058919  0.415707  0.387199\n",
       "2017-05-17  0.389779 -0.725836 -1.421320\n",
       "2017-05-18  1.899611  1.075165  0.363440\n",
       "2017-05-19 -1.620429  1.064473 -0.930995\n",
       "2017-05-20  0.874462  0.673706 -0.119880\n",
       "2017-05-21 -1.403869 -0.182207  3.523592\n",
       "2017-05-22 -1.388238  0.086740 -1.035970"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.loc[:, 'a':'c']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "iloc语法是基于位置的，可以使用布尔数组进行筛选。如果请求的索引超出数据框的边界，抛出IndexError异常。其接收的参数包括：\n",
    "  -  一个整数，如 5\n",
    "  - 一个整数序列，[1, 2, 3]\n",
    "  - 一个分片对象，如 1:7\n",
    "  - 一个布尔数组\n",
    "  - 单参数的可调用函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>2017-05-15</td>\n",
       "      <td>0.422582</td>\n",
       "      <td>2.192212</td>\n",
       "      <td>-0.195834</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-16</td>\n",
       "      <td>0.058919</td>\n",
       "      <td>0.415707</td>\n",
       "      <td>0.387199</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-17</td>\n",
       "      <td>0.389779</td>\n",
       "      <td>-0.725836</td>\n",
       "      <td>-1.421320</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2017-05-18</td>\n",
       "      <td>1.899611</td>\n",
       "      <td>1.075165</td>\n",
       "      <td>0.363440</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   a         b         c\n",
       "2017-05-15  0.422582  2.192212 -0.195834\n",
       "2017-05-16  0.058919  0.415707  0.387199\n",
       "2017-05-17  0.389779 -0.725836 -1.421320\n",
       "2017-05-18  1.899611  1.075165  0.363440"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.iloc[:4, :3] # 前4行和前3列"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5 算术运算与对齐"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "a = Series([1, 2, 3, 4], index=['d', 'b', 'c', 'a'])\n",
    "b = Series([5, 6, 7, 8], index=['b', 'd', 'e', 'f'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "a    NaN\n",
       "b    7.0\n",
       "c    NaN\n",
       "d    7.0\n",
       "e    NaN\n",
       "f    NaN\n",
       "dtype: float64"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "a+b # 公共的索引才计算，不同的为NaN"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    " - DataFrame + DataFrame也同样使用上面的规则（行列相同的计算，不同的为NaN）。\n",
    " - DataFrame + Series 会使用到数组扩充的转换机制。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6 数据清洗\n",
    "\n",
    "## 6.1 处理默认值\n",
    "\n",
    "np.nan和Python中的None也被认为是默认值。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = Series([np.nan, None, 'cat', 'dog'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     NaN\n",
       "1    None\n",
       "2     cat\n",
       "3     dog\n",
       "dtype: object"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0     True\n",
       "1     True\n",
       "2    False\n",
       "3    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.isnull()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "False"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.nan ==np.nan"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用 fillna() 函数填充默认值。 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = DataFrame(np.random.randn(5, 3), index=['a', 'c', 'e', 'f', 'h'], columns=['one', 'two', 'three'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.reindex(['a', 'b', 'c', 'd', 'e'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>three</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>a</td>\n",
       "      <td>1.524755</td>\n",
       "      <td>1.875874</td>\n",
       "      <td>-1.222462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>b</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>c</td>\n",
       "      <td>0.183745</td>\n",
       "      <td>-2.091938</td>\n",
       "      <td>-2.124474</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>d</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>e</td>\n",
       "      <td>0.520913</td>\n",
       "      <td>-1.275132</td>\n",
       "      <td>-0.273627</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        one       two     three\n",
       "a  1.524755  1.875874 -1.222462\n",
       "b       NaN       NaN       NaN\n",
       "c  0.183745 -2.091938 -2.124474\n",
       "d       NaN       NaN       NaN\n",
       "e  0.520913 -1.275132 -0.273627"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>three</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>a</td>\n",
       "      <td>1.524755</td>\n",
       "      <td>1.875874</td>\n",
       "      <td>-1.222462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>b</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>c</td>\n",
       "      <td>0.183745</td>\n",
       "      <td>-2.091938</td>\n",
       "      <td>-2.124474</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>d</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>e</td>\n",
       "      <td>0.520913</td>\n",
       "      <td>-1.275132</td>\n",
       "      <td>-0.273627</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        one       two     three\n",
       "a  1.524755  1.875874 -1.222462\n",
       "b  1.000000  1.000000  1.000000\n",
       "c  0.183745 -2.091938 -2.124474\n",
       "d  1.000000  1.000000  1.000000\n",
       "e  0.520913 -1.275132 -0.273627"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.fillna(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.loc['e'] = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>three</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>a</td>\n",
       "      <td>1.524755</td>\n",
       "      <td>1.875874</td>\n",
       "      <td>-1.222462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>b</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>c</td>\n",
       "      <td>0.183745</td>\n",
       "      <td>-2.091938</td>\n",
       "      <td>-2.124474</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>d</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>e</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        one       two     three\n",
       "a  1.524755  1.875874 -1.222462\n",
       "b       NaN       NaN       NaN\n",
       "c  0.183745 -2.091938 -2.124474\n",
       "d       NaN       NaN       NaN\n",
       "e       NaN       NaN       NaN"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>three</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>a</td>\n",
       "      <td>1.524755</td>\n",
       "      <td>1.875874</td>\n",
       "      <td>-1.222462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>b</td>\n",
       "      <td>1.524755</td>\n",
       "      <td>1.875874</td>\n",
       "      <td>-1.222462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>c</td>\n",
       "      <td>0.183745</td>\n",
       "      <td>-2.091938</td>\n",
       "      <td>-2.124474</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>d</td>\n",
       "      <td>0.183745</td>\n",
       "      <td>-2.091938</td>\n",
       "      <td>-2.124474</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>e</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        one       two     three\n",
       "a  1.524755  1.875874 -1.222462\n",
       "b  1.524755  1.875874 -1.222462\n",
       "c  0.183745 -2.091938 -2.124474\n",
       "d  0.183745 -2.091938 -2.124474\n",
       "e       NaN       NaN       NaN"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.fillna(method='pad', limit=1)  # 使用该行的上一行填充数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "使用 dropna() 丢弃掉默认值的行或者列。 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>one</th>\n",
       "      <th>two</th>\n",
       "      <th>three</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>a</td>\n",
       "      <td>1.524755</td>\n",
       "      <td>1.875874</td>\n",
       "      <td>-1.222462</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>c</td>\n",
       "      <td>0.183745</td>\n",
       "      <td>-2.091938</td>\n",
       "      <td>-2.124474</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        one       two     three\n",
       "a  1.524755  1.875874 -1.222462\n",
       "c  0.183745 -2.091938 -2.124474"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna(axis=0) # drop行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>a</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>b</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>c</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>d</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>e</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: []\n",
       "Index: [a, b, c, d, e]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna(axis=1) # drop列"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`interpolate() `提供了多种插值的方法，平方插值、立方插值等。 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.2 数据排序\n",
    "\n",
    "`DataFrame.sort_values(by, ascending=True, inplace=False)`\n",
    "\n",
    "- by: 根据某些列排序\n",
    "- ascending： 是否升序（默认True）\n",
    "- inplace : 是否直接修改原数据\n",
    "\n",
    "eg:\n",
    "\n",
    "`df.sort_values(by=['age', 'name'], ascending=[True, False])`"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.3 重复数据处理\n",
    "\n",
    "查找重复数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(\n",
    "    data={\n",
    "            \"id\": [1, 1, 3, 4, 5],\n",
    "            \"name\": [\"刘一\", \"刘一\", \"张三\", \"李四\", \"王五\"],\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1     True\n",
       "2    False\n",
       "3    False\n",
       "4    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.duplicated(keep=\"first\")  # 默认是所有数据相同才是重复数据, 保留第一个"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0    False\n",
       "1     True\n",
       "2    False\n",
       "3    False\n",
       "4    False\n",
       "dtype: bool"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.duplicated(['id'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**取出重复数据**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>刘一</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id name\n",
       "1   1   刘一"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df.duplicated(['id'])]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "删除重复数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>刘一</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>张三</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>李四</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>王五</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id name\n",
       "0   1   刘一\n",
       "2   3   张三\n",
       "3   4   李四\n",
       "4   5   王五"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
